library("readxl")
x_raw <- read_excel("./kcmillersean-billboard-hot-100-1958-2017/original/Hot 100 Audio Features.xlsx")

-
/
                                                                                                         
sum(is.na(x_raw$spotify_track_explicit))
[1] 4749
x_raw

We eliminate the rows where the target variable is missing.

billboard <- read.csv("./kcmillersean-billboard-hot-100-1958-2017/original/Hot Stuff.csv",header = TRUE)
billboard
x_full <- x_raw[!is.na(x_raw$danceability),]
x_full  <- x_full[!is.na(x_full$spotify_track_explicit),]
sum(is.na(x_full$spotify_track_explicit))
[1] 0
x_full
explicit <- x_full$spotify_track_explicit
x <- x_full[,c(9:22)] 
x
sum(is.na(x))
[1] 0

We have to see what to do with mode and time_signature.

dim(x)
[1] 23676    14

rr hist(x\(speechiness[x\)spotify_track_explicit == FALSE],freq = FALSE,border = ) hist(x\(speechiness[x\)spotify_track_explicit == TRUE],freq = FALSE,add = TRUE,border = )

C <-cor(x)
C
                          spotify_track_duration_ms spotify_track_popularity danceability      energy
spotify_track_duration_ms                1.00000000              0.198109141  0.092919922  0.12737492
spotify_track_popularity                 0.19810914              1.000000000  0.174720923  0.17919595
danceability                             0.09291992              0.174720923  1.000000000  0.20443232
energy                                   0.12737492              0.179195948  0.204432318  1.00000000
key                                      0.01093583              0.003174794  0.013974729  0.02062887
loudness                                 0.04015836              0.343989932  0.132717307  0.68614305
mode                                    -0.12642601             -0.115517992 -0.158627624 -0.10393873
speechiness                              0.04517899              0.203615939  0.246814282  0.14099577
acousticness                            -0.29805079             -0.305762683 -0.313068841 -0.58750719
instrumentalness                         0.01713666             -0.118529461  0.001627687 -0.00113371
liveness                                -0.03407297             -0.071495432 -0.129861234  0.11176775
valence                                 -0.15135401             -0.208846077  0.396338665  0.35604178
tempo                                   -0.01634128              0.024300644 -0.151816967  0.16201218
time_signature                           0.07397847              0.111737410  0.223424383  0.23011661
                                   key     loudness        mode speechiness acousticness instrumentalness
spotify_track_duration_ms  0.010935834  0.040158365 -0.12642601  0.04517899  -0.29805079      0.017136662
spotify_track_popularity   0.003174794  0.343989932 -0.11551799  0.20361594  -0.30576268     -0.118529461
danceability               0.013974729  0.132717307 -0.15862762  0.24681428  -0.31306884      0.001627687
energy                     0.020628866  0.686143053 -0.10393873  0.14099577  -0.58750719     -0.001133710
key                        1.000000000  0.008330687 -0.14400655  0.02637991  -0.02257299      0.003076654
loudness                   0.008330687  1.000000000 -0.07977649  0.17164055  -0.40592734     -0.132649016
mode                      -0.144006548 -0.079776487  1.00000000 -0.13329964   0.14295670     -0.011209945
speechiness                0.026379909  0.171640549 -0.13329964  1.00000000  -0.15490191     -0.056999138
acousticness              -0.022572993 -0.405927340  0.14295670 -0.15490191   1.00000000      0.026802890
instrumentalness           0.003076654 -0.132649016 -0.01120994 -0.05699914   0.02680289      1.000000000
liveness                  -0.002616060  0.043735527  0.01365576  0.07969188   0.04133453     -0.013224803
valence                    0.009441598  0.022982663 -0.01981331 -0.02052001  -0.12336474      0.047569447
tempo                     -0.014142880  0.093918294  0.02027891  0.05687958  -0.10503582      0.003096994
time_signature             0.008179034  0.122478723 -0.05847957  0.08640194  -0.21774293      0.009134502
                             liveness      valence        tempo time_signature
spotify_track_duration_ms -0.03407297 -0.151354007 -0.016341278    0.073978466
spotify_track_popularity  -0.07149543 -0.208846077  0.024300644    0.111737410
danceability              -0.12986123  0.396338665 -0.151816967    0.223424383
energy                     0.11176775  0.356041784  0.162012181    0.230116608
key                       -0.00261606  0.009441598 -0.014142880    0.008179034
loudness                   0.04373553  0.022982663  0.093918294    0.122478723
mode                       0.01365576 -0.019813314  0.020278909   -0.058479570
speechiness                0.07969188 -0.020520014  0.056879580    0.086401941
acousticness               0.04133453 -0.123364745 -0.105035822   -0.217742931
instrumentalness          -0.01322480  0.047569447  0.003096994    0.009134502
liveness                   1.00000000  0.024611119  0.019765001   -0.012604036
valence                    0.02461112  1.000000000  0.072930576    0.144666917
tempo                      0.01976500  0.072930576  1.000000000   -0.017368257
time_signature            -0.01260404  0.144666917 -0.017368257    1.000000000
for(i in 1 : (nrow(C) - 1)) {
  for(j in (i + 1) : ncol(C)) {
    if(abs(C[i, j]) >= 0.5) print(paste0(rownames(C)[i], " - ", colnames(C)[j]))
  }
}
[1] "energy - loudness"
[1] "energy - acousticness"
mu_exp
                                   [,1]
spotify_track_duration_ms  2.328391e+05
spotify_track_popularity   6.139210e+01
danceability               7.203289e-01
energy                     6.612681e-01
key                        5.423867e+00
loudness                  -6.365256e+00
mode                       5.505618e-01
speechiness                1.937759e-01
acousticness               1.422686e-01
instrumentalness           6.431818e-03
liveness                   1.990280e-01
valence                    5.030620e-01
tempo                      1.205554e+02
time_signature             3.998063e+00
mu_nexp = as.matrix(apply(x[explicit == FALSE,],2,mean))
mu_nexp
                                   [,1]
spotify_track_duration_ms  2.199564e+05
spotify_track_popularity   3.800659e+01
danceability               5.833845e-01
energy                     6.128126e-01
key                        5.216497e+00
loudness                  -8.999381e+00
mode                       7.516473e-01
speechiness                5.689042e-02
acousticness               3.148540e-01
instrumentalness           3.653562e-02
liveness                   1.918545e-01
valence                    6.177490e-01
tempo                      1.201533e+02
time_signature             3.922493e+00
apply(x[explicit == TRUE,],2,var)
spotify_track_duration_ms  spotify_track_popularity              danceability                    energy 
             2.944018e+09              2.114043e+02              1.944620e-02              2.283673e-02 
                      key                  loudness                      mode               speechiness 
             1.367608e+01              5.692062e+00              2.475394e-01              1.632485e-02 
             acousticness          instrumentalness                  liveness                   valence 
             3.011615e-02              2.867618e-03              2.262383e-02              4.817320e-02 
                    tempo            time_signature 
             9.131658e+02              7.790322e-02 
apply(x[explicit == FALSE,],2,var)
spotify_track_duration_ms  spotify_track_popularity              danceability                    energy 
             4.833257e+09              4.689667e+02              2.184157e-02              4.186070e-02 
                      key                  loudness                      mode               speechiness 
             1.254519e+01              1.319797e+01              1.866825e-01              3.255749e-03 
             acousticness          instrumentalness                  liveness                   valence 
             8.287255e-02              2.087088e-02              2.588822e-02              5.636188e-02 
                    tempo            time_signature 
             7.683991e+02              1.053511e-01 

Asumim que les variancies son diferents (s’hauria de fer box m test)

sum(explicit == FALSE)
[1] 21095
sum(explicit == TRUE)
[1] 2581
hist(x$loudness[explicit == FALSE],freq = FALSE,border = "green",ylim = c(0,0.2))
hist(x$loudness[explicit == TRUE],freq = FALSE,add  = TRUE,border = "red",ylim = c(0,0.2))

hist(x$speechiness[explicit == FALSE],freq = FALSE,border = "green")
hist(x$speechiness[explicit == TRUE],freq = FALSE,add  = TRUE,border = "red")

hist(x$danceability[explicit == FALSE],freq = FALSE,border = "green")
hist(x$danceability[explicit == TRUE],freq = FALSE,add  = TRUE,border = "red")

hist(x$spotify_track_popularity[explicit == FALSE],freq = FALSE,border = "green",ylim = c(0,0.05))
hist(x$spotify_track_popularity[explicit == TRUE],freq = FALSE,add  = TRUE,border = "red",ylim = c(0,0.05))

hist(x$valence[explicit == FALSE],freq = FALSE,border = "green")
hist(x$valence[explicit == TRUE],freq = FALSE,add  = TRUE,border = "red") 

set.seed(123)
xstd = scale(x,scale = TRUE)
c = sample(1:nrow(xstd),1000)
D = dist(xstd[c,])
hc.ward <- hclust(D,method="ward.D2")
plot(hc.ward,ylab="Distance",main="single linkage (weighted Euclidean)",
xlab="",hang=-1,las=1,cex.main=1)

clusters <- cutree(hc.ward, 2)
table(clusters,explicit[c])
        
clusters FALSE TRUE
       1   629   89
       2   277    5
# LDA
library(MASS)
out <- lda(explicit~.,data=x)
plot(out)

pca = princomp(x[c,],cor=TRUE)
plot(pca$scores[,1],pca$scores[,2],col = ifelse(explicit,"red","blue"),asp = 1)

Classificació

# Creem el dichotomizer
# Primer creem les funcions discriminants
prior_exp = sum(explicit == TRUE)/nrow(x)
prior_nexp = sum(explicit == FALSE)/nrow(x)
S_exp = cov(x[explicit == TRUE,])
S_nexp = cov(x[explicit == FALSE,])
S_exp_inv = solve(S_exp)
S_nexp_inv = solve(S_nexp)
prior_exp
[1] 0.1090133
prior_nexp
[1] 0.8909867

ara ens falta fer learn + test i cross validation

g = function(prior,S_inv,mu,x){
  x = t(as.matrix(x))
  g = log(prior) - log((2*pi)^(ncol(S)/2)) -1/2*t(x-mu) %*% S_inv %*% (x-mu)
  return(g)
}
mal = 0
for (i in 1:nrow(x)){
    dico = g(prior_exp,S_exp_inv,mu_exp,x[1,])-g(prior_nexp,S_nexp_inv,mu_nexp,x[1,])
    if ((dico < 0 && explicit[i]) || (dico > 0 && !explicit[i])) mal = mal + 1
}
mal/nrow(x)*100 # percentatge errors
[1] 10.90133
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KCJyZWFkeGwiKQp4X3JhdyA8LSByZWFkX2V4Y2VsKCIuL2tjbWlsbGVyc2Vhbi1iaWxsYm9hcmQtaG90LTEwMC0xOTU4LTIwMTcvb3JpZ2luYWwvSG90IDEwMCBBdWRpbyBGZWF0dXJlcy54bHN4IikKc3VtKGlzLm5hKHhfcmF3JHNwb3RpZnlfdHJhY2tfZXhwbGljaXQpKQp4X3JhdwpgYGAKV2UgZWxpbWluYXRlIHRoZSByb3dzIHdoZXJlIHRoZSB0YXJnZXQgdmFyaWFibGUgaXMgbWlzc2luZy4KCmBgYHtyfQpiaWxsYm9hcmQgPC0gcmVhZC5jc3YoIi4va2NtaWxsZXJzZWFuLWJpbGxib2FyZC1ob3QtMTAwLTE5NTgtMjAxNy9vcmlnaW5hbC9Ib3QgU3R1ZmYuY3N2IixoZWFkZXIgPSBUUlVFKQpiaWxsYm9hcmQKYGBgCgoKYGBge3J9CnhfZnVsbCA8LSB4X3Jhd1shaXMubmEoeF9yYXckZGFuY2VhYmlsaXR5KSxdCnhfZnVsbCAgPC0geF9mdWxsWyFpcy5uYSh4X2Z1bGwkc3BvdGlmeV90cmFja19leHBsaWNpdCksXQpzdW0oaXMubmEoeF9mdWxsJHNwb3RpZnlfdHJhY2tfZXhwbGljaXQpKQp4X2Z1bGwKYGBgCgpgYGB7cn0KZXhwbGljaXQgPC0geF9mdWxsJHNwb3RpZnlfdHJhY2tfZXhwbGljaXQKeCA8LSB4X2Z1bGxbLGMoOToyMildIApjb2xuYW1lcyh4KVsxXTwtImR1cmF0aW9uIgpjb2xuYW1lcyh4KVsyXTwtInBvcHVsYXJpdHkiCngKc3VtKGlzLm5hKHgpKQpgYGAKV2UgaGF2ZSB0byBzZWUgd2hhdCB0byBkbyB3aXRoIG1vZGUgYW5kIHRpbWVfc2lnbmF0dXJlLgoKYGBge3J9CmRpbSh4KQpgYGAKCmBgYHtyfQojIHBhaXJzKHhbLDE6Nl0sY29sID0gaWZlbHNlKGV4cGxpY2l0LCJyZWQiLCJibHVlIikpCmBgYAoKYGBge3J9CkMgPC1jb3IoeCkKQwpgYGAKCmBgYHtyfQpmb3IoaSBpbiAxIDogKG5yb3coQykgLSAxKSkgewogIGZvcihqIGluIChpICsgMSkgOiBuY29sKEMpKSB7CiAgICBpZihhYnMoQ1tpLCBqXSkgPj0gMC41KSBwcmludChwYXN0ZTAocm93bmFtZXMoQylbaV0sICIgLSAiLCBjb2xuYW1lcyhDKVtqXSkpCiAgfQp9CmBgYAoKCgpgYGB7cn0KbXVfZXhwID0gYXMubWF0cml4KGFwcGx5KHhbZXhwbGljaXQgPT0gVFJVRSxdLDIsbWVhbikpCm11X2V4cApgYGAKCmBgYHtyfQptdV9uZXhwID0gYXMubWF0cml4KGFwcGx5KHhbZXhwbGljaXQgPT0gRkFMU0UsXSwyLG1lYW4pKQptdV9uZXhwCmBgYAoKYGBge3J9CmFwcGx5KHhbZXhwbGljaXQgPT0gVFJVRSxdLDIsdmFyKQpgYGAKCmBgYHtyfQphcHBseSh4W2V4cGxpY2l0ID09IEZBTFNFLF0sMix2YXIpCmBgYAoKQXN1bWltIHF1ZSBsZXMgdmFyaWFuY2llcyBzb24gZGlmZXJlbnRzIChzJ2hhdXJpYSBkZSBmZXIgYm94IG0gdGVzdCkKCmBgYHtyfQpzdW0oZXhwbGljaXQgPT0gRkFMU0UpCnN1bShleHBsaWNpdCA9PSBUUlVFKQpgYGAKYGBge3J9Cmhpc3QoeCRsb3VkbmVzc1tleHBsaWNpdCA9PSBGQUxTRV0sZnJlcSA9IEZBTFNFLGJvcmRlciA9ICJncmVlbiIseWxpbSA9IGMoMCwwLjIpKQpoaXN0KHgkbG91ZG5lc3NbZXhwbGljaXQgPT0gVFJVRV0sZnJlcSA9IEZBTFNFLGFkZCAgPSBUUlVFLGJvcmRlciA9ICJyZWQiLHlsaW0gPSBjKDAsMC4yKSkKCgpgYGAKCmBgYHtyfQpoaXN0KHgkc3BlZWNoaW5lc3NbZXhwbGljaXQgPT0gRkFMU0VdLGZyZXEgPSBGQUxTRSxib3JkZXIgPSAiZ3JlZW4iKQpoaXN0KHgkc3BlZWNoaW5lc3NbZXhwbGljaXQgPT0gVFJVRV0sZnJlcSA9IEZBTFNFLGFkZCAgPSBUUlVFLGJvcmRlciA9ICJyZWQiKQpgYGAKYGBge3J9Cmhpc3QoeCRkYW5jZWFiaWxpdHlbZXhwbGljaXQgPT0gRkFMU0VdLGZyZXEgPSBGQUxTRSxib3JkZXIgPSAiZ3JlZW4iKQpoaXN0KHgkZGFuY2VhYmlsaXR5W2V4cGxpY2l0ID09IFRSVUVdLGZyZXEgPSBGQUxTRSxhZGQgID0gVFJVRSxib3JkZXIgPSAicmVkIikKYGBgCmBgYHtyfQpoaXN0KHgkc3BvdGlmeV90cmFja19wb3B1bGFyaXR5W2V4cGxpY2l0ID09IEZBTFNFXSxmcmVxID0gRkFMU0UsYm9yZGVyID0gImdyZWVuIix5bGltID0gYygwLDAuMDUpKQpoaXN0KHgkc3BvdGlmeV90cmFja19wb3B1bGFyaXR5W2V4cGxpY2l0ID09IFRSVUVdLGZyZXEgPSBGQUxTRSxhZGQgID0gVFJVRSxib3JkZXIgPSAicmVkIix5bGltID0gYygwLDAuMDUpKQpgYGAKYGBge3J9Cmhpc3QoeCR2YWxlbmNlW2V4cGxpY2l0ID09IEZBTFNFXSxmcmVxID0gRkFMU0UsYm9yZGVyID0gImdyZWVuIikKaGlzdCh4JHZhbGVuY2VbZXhwbGljaXQgPT0gVFJVRV0sZnJlcSA9IEZBTFNFLGFkZCAgPSBUUlVFLGJvcmRlciA9ICJyZWQiKSAKYGBgCgoKYGBge3J9CnNldC5zZWVkKDEyMykKeHN0ZCA9IHNjYWxlKHgsc2NhbGUgPSBUUlVFKQpjID0gc2FtcGxlKDE6bnJvdyh4c3RkKSwxMDAwKQpEID0gZGlzdCh4c3RkW2MsXSkKYGBgCgoKCmBgYHtyfQpoYy53YXJkIDwtIGhjbHVzdChELG1ldGhvZD0id2FyZC5EMiIpCnBsb3QoaGMud2FyZCx5bGFiPSJEaXN0YW5jZSIsbWFpbj0ic2luZ2xlIGxpbmthZ2UgKHdlaWdodGVkIEV1Y2xpZGVhbikiLAp4bGFiPSIiLGhhbmc9LTEsbGFzPTEsY2V4Lm1haW49MSkKY2x1c3RlcnMgPC0gY3V0cmVlKGhjLndhcmQsIDIpCnRhYmxlKGNsdXN0ZXJzLGV4cGxpY2l0W2NdKQpgYGAKCmBgYHtyfQojIExEQQpsaWJyYXJ5KE1BU1MpCm91dCA8LSBsZGEoZXhwbGljaXR+LixkYXRhPXgpCnBsb3Qob3V0KQpgYGAKCgpgYGB7cn0KIyBQQ0Egbm8gc2VwYXJhCnBjYSA9IHByaW5jb21wKHhbYyxdLGNvcj1UUlVFKQpwbG90KHBjYSRzY29yZXNbLDFdLHBjYSRzY29yZXNbLDJdLGNvbCA9IGlmZWxzZShleHBsaWNpdCwicmVkIiwiYmx1ZSIpLGFzcCA9IDEpCmBgYAoKCiMjIENsYXNzaWZpY2FjacOzCgpgYGB7cn0KIyBDcmVlbSBlbCBkaWNob3RvbWl6ZXIKIyBQcmltZXIgY3JlZW0gbGVzIGZ1bmNpb25zIGRpc2NyaW1pbmFudHMKcHJpb3JfZXhwID0gc3VtKGV4cGxpY2l0ID09IFRSVUUpL25yb3coeCkKcHJpb3JfbmV4cCA9IHN1bShleHBsaWNpdCA9PSBGQUxTRSkvbnJvdyh4KQpTX2V4cCA9IGNvdih4W2V4cGxpY2l0ID09IFRSVUUsXSkKU19uZXhwID0gY292KHhbZXhwbGljaXQgPT0gRkFMU0UsXSkKU19leHBfaW52ID0gc29sdmUoU19leHApClNfbmV4cF9pbnYgPSBzb2x2ZShTX25leHApCnByaW9yX2V4cApwcmlvcl9uZXhwCmBgYAoKCgphcmEgZW5zIGZhbHRhIGZlciBsZWFybiArIHRlc3QgaSBjcm9zcyB2YWxpZGF0aW9uCgpgYGB7cn0KZyA9IGZ1bmN0aW9uKHByaW9yLFNfaW52LG11LHgpewogIHggPSB0KGFzLm1hdHJpeCh4KSkKICBnID0gbG9nKHByaW9yKSAtIGxvZygoMipwaSleKG5jb2woUykvMikpIC0xLzIqdCh4LW11KSAlKiUgU19pbnYgJSolICh4LW11KQogIHJldHVybihnKQp9CgptYWwgPSAwCmZvciAoaSBpbiAxOm5yb3coeCkpewogICAgZGljbyA9IGcocHJpb3JfZXhwLFNfZXhwX2ludixtdV9leHAseFsxLF0pLWcocHJpb3JfbmV4cCxTX25leHBfaW52LG11X25leHAseFsxLF0pCiAgICBpZiAoKGRpY28gPCAwICYmIGV4cGxpY2l0W2ldKSB8fCAoZGljbyA+IDAgJiYgIWV4cGxpY2l0W2ldKSkgbWFsID0gbWFsICsgMQp9Cm1hbC9ucm93KHgpKjEwMCAjIHBlcmNlbnRhdGdlIGVycm9ycwpgYGAKCgoKCgoK